# -*- coding:utf8 -*-
# !/usr/bin/env python

"""
#全国企业信用信息公示系统（山东）
#维护黄羽
"""

import re
from scpy.logger import get_logger
import copy
import sys
import requests
from utils import kill_captcha
import hashlib
from table import index, report_index, table_clean, parse_time
import sd_trans_dict as TR
import sd_template_dict as TE
import sd_format as FO
import time
import traceback
import json

# Python 2-only hack: force the default codec to UTF-8 so the scraped
# Chinese text can be handled without explicit decode/encode calls.
reload(sys)
sys.setdefaultencoding('utf8')

logger = get_logger(__file__)

# Desktop Chrome UA sent with every request so the crawler looks like a browser.
UserAgent = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 Safari/537.36"


def download_captcha_kill(companyName):
    """
    获取公司基本信息网页
    当验证码错误，或者验证码服务出现错误时，重复下载验证码并破解;
    在下载网页的过程中对方服务出现错误,重新该下载网页(目前的方式是重新破解验证码，重新下载)
    :param companyName: 公司名字或者注册号
    :return:None 或者　字符串
    若公司不存在,返回None;
    若公司存在返回公司基本信息网页;
    """
    if not companyName:
        raise ValueError("input error!")

    index_url = 'http://218.57.139.24/'
    index_header = {
        'Connection': 'keep-alive',
        'Host': '218.57.139.24',
        'User-Agent': UserAgent,
    }

    index_req = requests.session()
    index_req.headers = index_header
    index_res = index_req.get(index_url)
    index_set_cookie = (index_res.cookies.get_dict() or {}).get('JSESSIONID', '')
    if not index_set_cookie:
        return ''
        # raise Exception("cookie 获取失败！")

    logger.info("当前cookie为：%s", index_set_cookie)

    index_html = index_res.content

    _csrf = re.findall('<meta name="_csrf" content="(.+?)".*?/>', index_html)

    if _csrf:
        _csrf = _csrf[0]
    else:
        logger.error("网页发生变化！")
        raise Exception("网页发生变化！")

    img_url = 'http://218.57.139.24/securitycode?0.7169512815307826'

    # img_headers = {
    #     'Accept':'image/webp,image/*,*/*;q=0.8',
    #     'Accept-Encoding':'gzip, deflate, sdch',
    #     'Accept-Language':'zh-CN,zh;q=0.8',
    #     'Connection':'keep-alive',
    #     'Cookie':index_set_cookie,
    #     'Host':'218.57.139.24',
    #     'Referer':'http://218.57.139.24/',
    #     'User-Agent':UserAgent,
    # }
    img_req = index_req
    # img_req.headers = img_headers
    try:
        captcha = img_req.get(img_url, timeout=200).content
    except Exception, e:
        logger.error("从网站下载验证码失败！重复下载！")
        logger.error(e)
        raise Exception("download captcha error")
    if not captcha:
        logger.error("从网站下载验证码为空！重复下载！")
        return ''

    # with open('./sd.jpg', 'wb') as fp:
    #     fp.write(captcha)

    try:
        res_code = kill_captcha(captcha, 'sd', 'jpeg')
        # print 'res code: ', res_code
    except Exception, e:
        logger.error("破解验证码的服务出现异常")
        logger.error(e)
        raise e
    if not res_code or len(res_code) > 100 or str(res_code) in ['None', 'wrong']:
        logger.info('验证码为:%s' % res_code)
        logger.error("破解验证码的服务出现异常,可能是下载的验证码错误，也可能破解服务出现异常！")
        return ''  # 返回空字符串，用于重复破解
    # res_code = raw_input('base, code=')

    # MD5 加密
    m1 = hashlib.md5()
    m1.update(res_code)
    secode = m1.hexdigest()

    check_url = 'http://218.57.139.24/pub/indsearch'
    # check_headers = {
    #     'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    #     'Accept-Encoding':'gzip, deflate',
    #     'Accept-Language':'zh-CN,zh;q=0.8',
    #     'Cache-Control':'max-age=0',
    #     'Connection':'keep-alive',
    #     'Content-Length':'230',
    #     'Content-Type':'application/x-www-form-urlencoded',
    #     'Cookie':index_set_cookie,
    #     'Host':'218.57.139.24',
    #     'Origin':'http://218.57.139.24',
    #     'Referer':'http://218.57.139.24/',
    #     'Upgrade-Insecure-Requests':'1',
    #     'User-Agent':UserAgent,
    # }

    check_data = {
        'kw': companyName,
        '_csrf': _csrf,
        'secode': secode,
    }
    check_req = img_req
    # check_req.headers = check_headers
    check_res = check_req.post(check_url, data=check_data, timeout=100).content
    # logger.info("网站返回：%s", check_res)
    # print check_res

    # 验证码错误重新破解
    # if re.findall('计算错误', check_res) or re.findall('经营异常名录', check_res):
    if re.findall('计算错误', check_res):
        # logger.error('check_res:%s' % check_res)
        logger.info("验证码错误！")
        return ''

    if re.findall('暂未查询到相关记录', check_res):
        logger.info("搜索的公司不存在！")
        return None
    elif not re.findall('href="(gsgsdetail.+?)"', check_res):
        # logger.error('check_res:%s' % check_res)
        return ''

    com_list = re.findall('href="(gsgsdetail.+?)"', check_res)

    return com_list[0]


def get_company_info(com_list):
    """
    Download every raw page/JSON payload for one company.

    :param com_list: relative detail path shaped like
                     "gsgsdetail/<com_type>/<encrpripid>" as returned by
                     download_captcha_kill
    :return: dict with the raw HTML/JSON payloads under "html" and the raw
             annual-report pages under "yearList"
    :raises Exception: when com_list is empty, the session cookie is
                       missing, or the site layout changed
    """
    if not com_list:
        raise Exception("com_list 错误")
    raw_dict = {
        "province": "sd",
        "type": "3",
        "html": {},
        "yearList": [],
        "keyword": "",
        "companyName": "",
        "json": "",
    }

    # Split "gsgsdetail/<com_type>/<encrpripid>" into its two id segments.
    encrpripid = re.findall('.+?/.+?/(.+)', com_list)
    com_type = re.findall('.+?/(.*?)/.*', com_list)

    if encrpripid and com_type:
        encrpripid = encrpripid[0]
        com_type = com_type[0]
    else:
        raise Exception("网站发生变化！")
    # basic-information page
    base_url = 'http://218.57.139.24/pub/' + com_list
    # key personnel (JSON endpoint)
    gsryxx_url = 'http://218.57.139.24/pub/gsryxx/' + com_type
    # branch offices (JSON endpoint)
    gsfzjg_url = 'http://218.57.139.24/pub/gsfzjg/' + com_type
    # abnormal-operation records (JSON endpoint)
    jyyc_url = 'http://218.57.139.24/pub/jyyc/' + com_type
    # spot-check records (JSON endpoint)
    ccjcxx_url = 'http://218.57.139.24/pub/ccjcxx'

    # asic_req = check_req
    # asic_req = RequestUtil()
    asic_req = requests.session()
    asic_req.headers = {'User-Agent': UserAgent}

    # asic_req.set_hreaders(asic_headers)
    # basic information
    # base_res = asic_req.make_request(base_url, method='get', timeout=20).content
    base_res = asic_req.get(base_url, timeout=20)
    base_set_cookie_fist = base_res.headers.get('set-cookie', '')
    if base_set_cookie_fist:
        # Strip path/separator noise so the raw cookie value can be replayed
        # verbatim in the Cookie headers below.
        base_set_cookie = base_set_cookie_fist.replace("Path=/", "").replace(",", "").replace(" ", "").replace(";", "")
    else:
        raise Exception("cookie 获取失败！")
    logger.info("当前cookie为：%s", base_set_cookie)
    base_content = base_res.content
    # CSRF token required by the JSON endpoints, embedded in a <meta> tag.
    _csrf = re.findall('<meta name="_csrf" content="(.+?)".*?/>', base_content)
    if _csrf:
        _csrf = _csrf[0]
    else:
        logger.error("网页发生变化！")
        raise Exception("网页发生变化！")
    # NOTE(review): asic_headers is built but never attached to any request
    # (only the commented-out set_hreaders call above used it) -- presumably
    # dead code left from the RequestUtil era; verify before deleting.
    asic_headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'Cookie': base_set_cookie,
        'Host': '218.57.139.24',
        'Referer': 'http://218.57.139.24/pub/indsearch',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': UserAgent,
    }

    # Headers for the XHR-style JSON endpoints (CSRF token goes in a header).
    other_headers = {
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Connection': 'keep-alive',
        'Content-Length': '75',
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'Cookie': base_set_cookie,
        'Host': '218.57.139.24',
        'Origin': 'http://218.57.139.24',
        'Referer': base_url,
        'X-CSRF-TOKEN': _csrf,
        'X-Requested-With': 'XMLHttpRequest',
        'User-Agent': UserAgent,
    }
    other_data = {
        'encrpripid': encrpripid,
    }

    raw_dict["html"]['base_res'] = base_content
    # Shareholder / change-record arrays are embedded in the page as
    # "var czxxliststr = '[...]';" style script variables.
    czxxliststr = re.findall('var czxxliststr.*?\'(\[.*\])\';', base_content)
    raw_dict["html"]['czxxliststr'] = json.loads(czxxliststr[0]) if czxxliststr else []
    bgsxliststr = re.findall('var bgsxliststr.*?\'(\[.*\])\';', base_content)
    raw_dict["html"]['bgsxliststr'] = json.loads(bgsxliststr[0]) if bgsxliststr else []

    # asic_req.set_hreaders(other_headers)
    asic_req.headers = other_headers
    # key personnel, JSON
    # gsryxx_res = asic_req.make_request(gsryxx_url, data=other_data, method='post', timeout=20).content
    gsryxx_res = asic_req.post(gsryxx_url, data=other_data, timeout=20).content
    raw_dict["html"]['gsryxx_res'] = json.loads(gsryxx_res) if gsryxx_res else []
    # branch offices, JSON
    # gsfzjg_res = asic_req.make_request(gsfzjg_url, data=other_data, method='post', timeout=20).content
    gsfzjg_res = asic_req.post(gsfzjg_url, data=other_data, timeout=20).content
    raw_dict["html"]['gsfzjg_res'] = json.loads(gsfzjg_res) if gsfzjg_res else []
    # abnormal operations, JSON
    # jyyc_res = asic_req.make_request(jyyc_url, data=other_data, method='post', timeout=20).content
    jyyc_res = asic_req.post(jyyc_url, data=other_data, timeout=20).content
    raw_dict["html"]['jyyc_res'] = json.loads(jyyc_res) if jyyc_res else []
    # spot checks, JSON
    # ccjcxx_res = asic_req.make_request(ccjcxx_url, data=other_data, method='post', timeout=20).content
    ccjcxx_res = asic_req.post(ccjcxx_url, data=other_data, timeout=20).content
    raw_dict["html"]['ccjcxx_res'] = json.loads(ccjcxx_res) if ccjcxx_res else []
    # annual-report section
    year_page_url = 'http://218.57.139.24/pub/qygsdetail/%s/%s' % (com_type, encrpripid)
    # year_page_res = asic_req.make_request(year_page_url, method='get', timeout=20).content
    year_page_res = asic_req.get(year_page_url, timeout=20).content
    asic_req.close()
    # Each match is (relative report URL, 4-digit year).
    year_page_list = re.findall('<a href="(/pub/nb/.*?)".*?(\d+?)年度报告', year_page_res, re.S)
    raw_year_list = []

    year_report_header = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'Cookie': base_set_cookie,
        'Host': '218.57.139.24',
        'Referer': year_page_url,
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': UserAgent,
    }
    # year_req = RequestUtil()
    year_req = requests.session()
    year_req.headers = {'User-Agent': UserAgent}
    if year_page_list:
        for item in year_page_list:
            raw_year_dict = {}
            if not item or len(item) != 2:
                continue
            report_url = 'http://218.57.139.24' + item[0]
            # year_req.set_hreaders(year_report_header)
            year_req.headers = year_report_header
            try:
                # year_req.make_request(report_url, method='get', timeout=8).content
                # First hit routinely times out (server-side warm-up), so the
                # timeout is swallowed and the page is fetched a second time.
                year_req.get(report_url, timeout=8).content
            except Exception, e:
                # logger.exception(e)
                logger.info("需要访问两次，第一次会timeout")
                pass
            # report_res = year_req.make_request(report_url, method='get', timeout=20).content
            report_res = year_req.get(report_url, timeout=20).content

            raw_year_dict['year'] = item[1]
            raw_year_dict['html'] = report_res
            raw_year_list.append(raw_year_dict)

        raw_dict["yearList"] = raw_year_list

    return raw_dict


def extract_base_info(raw_dict):
    """
    Parse the raw pages collected by get_company_info into the base dict.

    :param raw_dict: raw-page dict produced by get_company_info
    :return: populated copy of TE.void_base_dict, or None when input is empty
    """
    if not raw_dict:
        return None

    def _ms_to_str(stamp):
        # Site timestamps arrive as {'time': <epoch milliseconds>}.  Render
        # as "YYYY-mm-dd HH:MM:SS", or '' when missing/zero.  This replaces
        # five inline copies of the same expression; it also fixes the
        # altDate copy, which skipped the ['time'] check and would format
        # the epoch for a zero timestamp.
        if not stamp or not stamp.get('time'):
            return ''
        return time.strftime("%Y-%m-%d %H:%M:%S",
                             time.localtime(stamp['time'] / 10 ** 3))

    res_dict = copy.deepcopy(TE.void_base_dict)

    # basic information (scraped table from the detail HTML)
    base_table = table_clean(raw_dict["html"]['base_res'], '基本信息')
    if base_table:
        res_dict['basicList'] = index("基本信息", base_table)
        if res_dict['basicList']:
            res_dict["province"] = "sd"

    # shareholders
    res_dict['shareHolderList'] = [
        FO.transform_dict(TE.shareHolder_dict, TR.shareHolder_dict, item)
        for item in raw_dict["html"]['czxxliststr']
    ]

    # change records
    alter_list = []
    for item in raw_dict["html"]['bgsxliststr']:
        alter_dict = FO.transform_dict(TE.alter_dict, TR.alter_dict, item)
        alter_dict['altDate'] = _ms_to_str(item['altdate'])
        alter_list.append(alter_dict)
    res_dict['alterList'] = alter_list

    # key personnel
    res_dict['personList'] = [
        FO.transform_dict(TE.person_dict, TR.person_dict, item)
        for item in raw_dict["html"]['gsryxx_res']
    ]

    # branch offices
    res_dict['filiationList'] = [
        FO.transform_dict(TE.filiation_dict, TR.filiation_dict, item)
        for item in raw_dict["html"]['gsfzjg_res']
    ]

    # abnormal operations
    abnormal_list = []
    for item in raw_dict["html"]['jyyc_res']:
        abnormal = FO.transform_dict(TE.abnormalOperation_dict, TR.abnormalOperation_dict, item)
        abnormal['abntime'] = _ms_to_str(item['abntime'])
        abnormal['retime'] = _ms_to_str(item['remdate'])
        abnormal_list.append(abnormal)
    res_dict['abnormalOperation'] = abnormal_list

    # spot checks
    check_list = []
    for item in raw_dict["html"]['ccjcxx_res']:
        check_dict = FO.transform_dict(TE.checkMessage_dict, TR.checkMessage_dict, item)
        check_dict['check_date'] = _ms_to_str(item['insdate'])
        check_list.append(check_dict)
    res_dict['checkMessage'] = check_list

    return res_dict


def extract_year_info(raw_dict):
    """
    Parse the raw annual-report pages into a list of per-year dicts.

    :param raw_dict: raw-page dict produced by get_company_info
    :return: list of year dicts; [] when there are no reports;
             None when raw_dict itself is empty
    """
    if not raw_dict:
        return None
    raw_year_list = raw_dict["yearList"]
    if not raw_year_list:
        return []

    def _ms_to_str(stamp):
        # {'time': <epoch ms>} -> "YYYY-mm-dd HH:MM:SS"; '' when missing/zero.
        if not stamp or not stamp.get('time'):
            return ''
        return time.strftime("%Y-%m-%d %H:%M:%S",
                             time.localtime(stamp['time'] / 10 ** 3))

    def _embedded_json(html, var_name):
        # Report pages embed JSON arrays as "var <name> = '[...]';" script
        # variables; return the parsed list, or [] when the variable is absent.
        found = re.findall('var %s.*?\'(\[.*\])\';' % var_name, html)
        return json.loads(found[0]) if found else []

    yearList = []
    for raw_year in raw_year_list:
        year_dict = copy.deepcopy(TE.void_year_dict)
        html = raw_year['html']

        # basic info (the table header varies between the two spellings)
        year_base_table = table_clean(html, '企业基本信息') + table_clean(html, "基本信息")
        year_dict['baseInfo'] = report_index('企业基本信息', year_base_table) if year_base_table else {}

        # website info
        raw_web = _embedded_json(html, 'wdxxliststr')
        year_dict['website'] = FO.transform_dict(TE.website_dict, TR.website_dict, raw_web[0]) if raw_web else {}

        # investor contributions
        investorInformations_list = []
        for item in _embedded_json(html, 'czxxliststr'):
            investor_dict = FO.transform_dict(TE.investorInformations_dict, TR.investorInformations_dict,
                                              item) if item else {}
            investor_dict['subConDate'] = _ms_to_str(item['subcondate'])
            investor_dict['paidTime'] = _ms_to_str(item['accondate'])
            investorInformations_list.append(investor_dict)
        year_dict['investorInformations'] = investorInformations_list

        # company asset status
        year_assets_table = table_clean(html, '企业资产状况信息')
        year_dict['assetsInfo'] = report_index('企业资产状况信息', year_assets_table) if year_assets_table else {}

        # change records
        change_list = []
        for item in _embedded_json(html, 'nbalthisstr'):
            change_dict = FO.transform_dict(TE.changeRecords_dict, TR.changeRecords_dict, item) if item else {}
            change_dict['time'] = _ms_to_str(item['altdate'])
            change_list.append(change_dict)
        year_dict['changeRecords'] = change_list

        year_dict['year'] = raw_year['year']

        yearList.append(year_dict)
    return yearList


def search2(companyName, MAXTIME=40):
    res = ''
    asic_dict = {}
    # MAXTIME = 20
    a_time = MAXTIME
    while a_time > 0:
        # print res, '*'*20
        if res is None:  # 公司不存在
            return None
        elif res == '':  # 验证码错误
            if a_time < MAXTIME:
                logger.error("重复破解验证码!当前设定重复破解次数为:%s, 剩余次数为:%s " % (MAXTIME, a_time))
            a_time -= 1
            try:
                # time.sleep(10)
                res = download_captcha_kill(companyName)
                # print res
            except Exception, e:
                traceback.print_exc(e)
                raise e
        else:
            break
    com_list = res
    res = get_company_info(com_list)
    if a_time <= 1 and res == '':
        raise Exception("多次破解验证码错误,当前设置次数为：%s" % MAXTIME)
    else:
        try:
            raw_dict = res
            asic_dict = extract_base_info(raw_dict)
            year_list = extract_year_info(raw_dict)

            company_name = asic_dict['basicList'][0].get('enterpriseName', '')
            company_name = company_name if company_name else companyName
            res['companyName'] = company_name

            asic_dict['yearReportList'] = year_list
            gate_method = {
                'url': 'http://218.57.139.24/pub/',
                'method': 'get',
                'province': 'sd',
                'companyName': company_name,
                'data': com_list,
            }

            return res, asic_dict, gate_method
        except Exception, e:
            traceback.print_exc(e)
            logger(e)

            res['companyName'] = companyName
            gate_method = {
                'url': 'http://218.57.139.24/pub/',
                'method': 'get',
                'province': 'sd',
                'companyName': companyName,
                'data': com_list,
            }
            return res, None, gate_method


def search(companyName):
    """Return only the parsed company dict for *companyName*, or None."""
    result = search2(companyName)
    return result[1] if result else None


def search3(gate_method):
    """
    Re-crawl a company from a previously returned gate_method dict.

    :param gate_method: dict carrying the detail path under the 'data' key
    :return: tuple (raw_dict, parsed_dict, fresh gate_method)
    :raises Exception: when gate_method has no 'data' key
    """
    if 'data' not in gate_method:
        raise Exception("gate_method error, doesn't have `data` key")
    detail_path = gate_method.get('data')

    raw_dict = get_company_info(detail_path)
    parsed = extract_base_info(raw_dict)
    years = extract_year_info(raw_dict)

    # Official name scraped from the site (computed once, used twice).
    enterprise_name = parsed['basicList'][0].get('enterpriseName', '')
    raw_dict['companyName'] = enterprise_name
    parsed['yearReportList'] = years

    fresh_gate_method = {
        'url': 'http://218.57.139.24/pub/',
        'method': 'get',
        'province': 'sd',
        'companyName': enterprise_name,
        'data': detail_path,
    }

    return raw_dict, parsed, fresh_gate_method


if __name__ == '__main__':
    # Manual smoke test: crawl one hard-coded company and dump the result.
    # companyName = '山东石大胜华化工集团股份有限公司'
    # companyName = '山东石大胜华化工集团股份有限公司垦利分公司'
    # companyName = '山东东佳集团股份有限公司'
    # companyName = '合力泰科技股份有限公司'
    # companyName = '山东祥和集团股份有限公司博山微电机厂'
    companyName = '山东利达丰华农资有限公司'
    # companyName = '菏泽开发区冠森配货信息咨询部'
    # companyName = '青岛立平商赞商贸有限公司'
    # companyName = '青岛嘉泰盛市政工程有限公司'
    # companyName = '山东嘉隆新型材料有限公司'
    # companyName = '猪八戒网络'
    # res = get_company_info(companyName)
    # extract_base_info(res)
    # extract_year_info(res)
    res = search2(companyName)
    # res = search(companyName)
    print json.dumps(res, indent=4, ensure_ascii=False)



    # import pymongo
    # import json
    # # pymongo.MongoClient('192.168.31.121', 27017)
    # clientServer = pymongo.MongoClient('192.168.31.121',27017)
    # db = clientServer.crawler_company_name
    # collectionServer = db.companyName
    # reg_no_s = collectionServer.find({'province': 'sd'}).limit(100)
    # for reg in reg_no_s:
    #     print '#'*10
    #     print reg
    #     print '#'*10
    #     reg_no = reg['regNo']
    #     try:
    #         res = search(reg_no)
    #         print json.dumps(res, indent=4, ensure_ascii=False)
    #     except Exception, e:
    #         import traceback
    #         traceback.print_exc()
    #         # print reg
    #         # import pdb
    #         # pdb.set_trace()
    #
